Dual CRISPR Screen Analysis

Step 6: Scoring Preparation

Amanda Birmingham, CCBB, UCSD (abirmingham@ucsd.edu)

Instructions

To run this notebook reproducibly, follow these steps:

  1. Click Kernel > Restart & Clear Output
  2. When prompted, click the red Restart & clear all outputs button
  3. In the Input Parameters section, set each variable to the value appropriate for your analysis
  4. Click Cell > Run All

Input Parameters


In [ ]:
# Name of the dataset: used below to generate a run prefix and passed to the
# count merging/annotation step.
g_dataset_name = "Notebook6Test"
# Path to the library definition file; run through ns_files.expand_path during set-up.
g_library_fp = '~/dual_crispr/library_definitions/test_library_2.txt'
# Comma-separated list of count file paths and/or directories; during set-up each
# directory entry is replaced by the combined-counts files found inside it.
# NOTE(review): these are absolute /home/ec2-user paths while the other paths use ~ -- confirm intended.
g_count_fps_or_dirs = '/home/ec2-user/dual_crispr/test_data/test_set_6a,/home/ec2-user/dual_crispr/test_data/test_set_6b'
# Delimited list of timepoint prefixes; split into a list during set-up
# (presumably comma-delimited -- see split_delimited_string_to_list).
g_time_prefixes = "T,D"
# Run prefix for the output file; handed to ns_runs.check_or_set together with a
# prefix generated from g_dataset_name (presumably the generated one is used when this is empty -- confirm).
g_prepped_counts_run_prefix = ""
# Directory in which the scoring-ready output file is written.
g_prepped_counts_dir = '~/dual_crispr/test_outputs/test_set_6'

Automated Set-Up


In [ ]:
import inspect

import ccbb_pyutils.analysis_run_prefixes as ns_runs
import ccbb_pyutils.files_and_paths as ns_files
import ccbb_pyutils.notebook_logging as ns_logs


def describe_var_list(input_var_name_list):
    """Build a printable "name: value" summary of module-level variables.

    Args:
        input_var_name_list: iterable of strings, each the name of a
            module-level (notebook global) variable to describe.

    Returns:
        A single string containing one "<name>: <value>" line per entry, each
        terminated by a newline; the empty string if the list is empty.

    Raises:
        NameError: if an entry does not name anything defined at module level.
    """
    # Evaluate against the module namespace explicitly: a bare eval() here
    # would run in this function's scope, where its own local names (e.g. the
    # loop variable) shadow notebook globals of the same name.
    # NOTE: eval() executes arbitrary expressions -- only pass trusted,
    # hard-coded variable names, never user-supplied text.
    module_namespace = globals()
    return "".join("{0}: {1}\n".format(var_name, eval(var_name, module_namespace))
                   for var_name in input_var_name_list)


# Configure logging for this notebook via ccbb_pyutils; presumably routes
# INFO-level log messages to stdout (inferred from the helper's name -- confirm).
ns_logs.set_stdout_info_logger()

In [ ]:
# Print the source of the helper that supplies the combined-counts file suffix,
# so the exact implementation used is visible in the notebook output.
import dual_crispr.count_combination as ns_combine
print(inspect.getsource(ns_combine.get_combined_counts_file_suffix))

In [ ]:
# Print the source of the string-splitting helper used below to turn
# g_time_prefixes into a list.
import ccbb_pyutils.string_utils as ns_string
print(inspect.getsource(ns_string.split_delimited_string_to_list))

In [ ]:
import os

def get_count_file_fps(comma_sep_fps_or_dirs_str):
    """Expand a comma-separated string of paths into a list of count file paths.

    Each comma-separated entry is stripped of surrounding whitespace and
    passed through ns_files.expand_path.  An entry that is an existing
    directory is replaced by the file paths that
    ns_files.get_filepaths_from_wildcard returns for that directory and the
    combined-counts file suffix; any other entry is kept as-is (its existence
    is not checked here).  Empty entries -- from an empty input string or from
    doubled/trailing commas -- are skipped.

    Args:
        comma_sep_fps_or_dirs_str: string of file and/or directory paths
            separated by commas.

    Returns:
        List of file paths, in the order the entries were given.
    """
    result = []

    for raw_entry in comma_sep_fps_or_dirs_str.split(","):
        trimmed_entry = raw_entry.strip()
        if not trimmed_entry:
            # Nothing between the commas: skip it rather than handing an empty
            # path to expand_path and recording a bogus entry.
            continue

        expanded_entry = ns_files.expand_path(trimmed_entry)
        if os.path.isdir(expanded_entry):
            # Directory: collect every combined-counts file it contains.
            result.extend(ns_files.get_filepaths_from_wildcard(
                expanded_entry, ns_combine.get_combined_counts_file_suffix()))
        else:
            result.append(expanded_entry)

    return result


# Resolve the raw input parameters into the values used by the rest of the notebook.
# NOTE: g_library_fp, g_prepped_counts_run_prefix and g_prepped_counts_dir are
# rebound in place, so re-running this cell feeds it its own earlier output.
g_library_fp = ns_files.expand_path(g_library_fp)
# Turn the comma-separated files/directories string into a flat list of count file paths.
g_count_file_fps = get_count_file_fps(g_count_fps_or_dirs)
# Pass the user-supplied prefix and a prefix generated from the dataset name to check_or_set
# (presumably the generated one is used only when the supplied one is empty -- confirm).
g_prepped_counts_run_prefix = ns_runs.check_or_set(g_prepped_counts_run_prefix,
                                                   ns_runs.generate_run_prefix(g_dataset_name))
g_time_prefixes_list = ns_string.split_delimited_string_to_list(g_time_prefixes)
g_prepped_counts_dir = ns_files.expand_path(g_prepped_counts_dir)

# Echo the resolved values into the notebook output.
# NOTE(review): g_prepped_counts_dir is resolved above but not included in this summary.
print(describe_var_list(['g_library_fp', 'g_count_file_fps', 'g_prepped_counts_run_prefix', 'g_time_prefixes_list']))
# Presumably checks that the output directory exists and creates it if not (inferred from the name -- confirm).
ns_files.verify_or_make_dir(g_prepped_counts_dir)

Scoring-Ready File Preparation


In [ ]:
# Print the full source of the scoring_prep module so the exact implementation
# used for this run is visible in the notebook output.
import dual_crispr.scoring_prep as ns_prep
print(inspect.getsource(ns_prep))

In [ ]:
def merge_and_write_timepoint_counts(count_file_fps, constructs_fp, run_prefix, dataset_name, time_prefixes_list,
                                     output_dir, disregard_order=True):
    """Merge and annotate the count files and write the result as a tab-separated file.

    The merged table comes from ns_prep.merge_and_annotate_counts; the output
    path is built by ns_files.build_multipart_fp from output_dir, run_prefix
    and the suffix returned by ns_prep.get_prepped_file_suffix.

    Args:
        count_file_fps: list of count file paths to merge.
        constructs_fp: path to the library (constructs) definition file.
        run_prefix: prefix used in the name of the output file.
        dataset_name: dataset name forwarded to the merge step.
        time_prefixes_list: list of timepoint prefixes forwarded to the merge step.
        output_dir: directory in which the output file is written.
        disregard_order: forwarded unchanged to ns_prep.merge_and_annotate_counts
            (defaults to True).

    Returns:
        None; the merged table is written to disk without its index.
    """
    # Forward the caller's disregard_order instead of a hard-coded True so the
    # parameter actually takes effect.
    joined_df = ns_prep.merge_and_annotate_counts(count_file_fps, constructs_fp, dataset_name,
                                                  time_prefixes_list, disregard_order=disregard_order)
    prepped_file_suffix = ns_prep.get_prepped_file_suffix()
    output_fp = ns_files.build_multipart_fp(output_dir, [run_prefix, prepped_file_suffix])
    joined_df.to_csv(output_fp, index=False, sep='\t')

In [ ]:
# Run the preparation with the parameters resolved during set-up; the final
# positional True is the disregard_order argument.
merge_and_write_timepoint_counts(g_count_file_fps, g_library_fp, g_prepped_counts_run_prefix, g_dataset_name,
                                 g_time_prefixes_list, g_prepped_counts_dir, True)

In [ ]:
# Print the result of looking in the output directory for a file with this run's
# prefix and the prepped-file suffix; the message is supplied for the failure case
# (how check_file_presence reports a failure is not visible here -- confirm).
print(ns_files.check_file_presence(g_prepped_counts_dir, g_prepped_counts_run_prefix, 
                                   ns_prep.get_prepped_file_suffix(),
                                   check_failure_msg="Scoring preparation failed to produce an output file."))